6장 - 이질적 처치효과

6.1 ATE에서 CATE로

Why Prediction is not the Answer

CATE with Regression

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

import matplotlib

from cycler import cycler

# Grayscale color cycle so figures render legibly in black-and-white print
color = ["0.0", "0.4", "0.8"]
default_cycler = cycler(color=color)
linestyle = ["-", "--", ":", "-."]
marker = ["o", "v", "d", "p"]

plt.rc("axes", prop_cycle=default_cycler)

matplotlib.rcParams.update({"font.size": 18})
# Daily restaurant sales panel: one row per restaurant per day
data = pd.read_csv("../data/daily_restaurant_sales.csv")

data.head()
rest_id day month weekday weekend is_holiday is_dec is_nov competitors_price discounts sales
0 0 2016-01-01 1 4 False True False False 2.88 0 79.0
1 0 2016-01-02 1 5 True False False False 2.64 0 57.0
2 0 2016-01-03 1 6 True False False False 2.08 5 294.0
3 0 2016-01-04 1 0 False False False False 3.37 15 676.5
4 0 2016-01-05 1 1 False False False False 3.79 0 66.0
import statsmodels.formula.api as smf

# Covariates to interact with the treatment (discounts)
X = ["C(month)", "C(weekday)", "is_holiday", "competitors_price"]
# Interacting discounts with every covariate lets the discount slope
# vary with the covariates -- i.e. an OLS model of the CATE
regr_cate = smf.ols(f"sales ~ discounts*({'+'.join(X)})", data=data).fit()
# CATE prediction: predicted sales at (discounts + 1) minus predicted
# sales at the observed discounts = marginal effect of one extra unit
ols_cate_pred = regr_cate.predict(
    data.assign(discounts=data["discounts"] + 1)
) - regr_cate.predict(data)

Evaluating CATE Predictions

# Temporal split: fit on pre-2018 data, evaluate on 2018 onward
train = data.query("day<'2018-01-01'")
test = data.query("day>='2018-01-01'")
X = ["C(month)", "C(weekday)", "is_holiday", "competitors_price"]
# Same interacted OLS as before, but fit on the training window only
regr_model = smf.ols(f"sales ~ discounts*({'+'.join(X)})", data=train).fit()

# Out-of-sample CATE score: change in predicted sales from one extra
# unit of discount
cate_pred = regr_model.predict(
    test.assign(discounts=test["discounts"] + 1)
) - regr_model.predict(test)
from sklearn.ensemble import GradientBoostingRegressor

# Purely predictive benchmark: gradient boosting that predicts sales
# LEVELS (not treatment effects), with discounts as just another feature
X = ["month", "weekday", "is_holiday", "competitors_price", "discounts"]
y = "sales"

np.random.seed(42)
ml_model = GradientBoostingRegressor(n_estimators=50).fit(train[X], train[y])

ml_pred = ml_model.predict(test[X])
np.random.seed(123)

# Gather the three candidate scores side by side: the ML sales
# prediction, the regression CATE, and pure uniform noise as a baseline
test_pred = test.assign(
    ml_pred=ml_pred,
    cate_pred=cate_pred,
    rand_m_pred=np.random.uniform(-1, 1, len(test)),
)
test_pred[["rest_id", "day", "sales", "ml_pred", "cate_pred", "rand_m_pred"]].head()
rest_id day sales ml_pred cate_pred rand_m_pred
731 0 2018-01-01 251.5 236.312960 41.355802 0.392938
732 0 2018-01-02 541.0 470.218050 44.743887 -0.427721
733 0 2018-01-03 431.0 429.180652 39.783798 -0.546297
734 0 2018-01-04 760.0 769.159322 40.770278 0.102630
735 0 2018-01-05 78.0 83.426070 40.666949 0.438938

Effect by Model Quantile

from toolz import curry


@curry
def effect(data, y, t):
    """Estimate the average effect of treatment column `t` on outcome
    column `y` in `data` as the slope of a bivariate regression:
    cov(t, y) / var(t), computed from raw sums.

    Curried: `effect(y=y, t=t)` returns a function of `data` alone.
    """
    t_centered = data[t] - data[t].mean()
    return (t_centered * data[y]).sum() / (t_centered**2).sum()
# Average effect of discounts on sales over the whole test set (the ATE)
effect(test, "sales", "discounts")
32.16196368039615
def effect_by_quantile(df, pred, y, t, q=10):
    """Estimate the effect of `t` on `y` within each of `q` quantile
    groups of the score column `pred`.

    Groups are labeled by the rounded midpoint of their quantile bin,
    so the result's index is ordered by the score.
    """
    # bin rows by prediction quantile and label each bin by its midpoint
    bins = pd.qcut(df[pred], q=q)
    labels = np.round(pd.IntervalIndex(bins).mid, 2)

    quantile_col = f"{pred}_quantile"
    grouped = df.assign(**{quantile_col: labels}).groupby(quantile_col)
    # effect is curried: effect(y=y, t=t) waits for each group's frame
    return grouped.apply(effect(y=y, t=t))


# Discount effect within each decile of the regression CATE score
effect_by_quantile(test_pred, "cate_pred", y="sales", t="discounts")
cate_pred_quantile
17.50    20.494153
23.93    24.782101
26.85    27.494156
28.95    28.833993
30.81    29.604257
32.68    32.216500
34.65    35.889459
36.75    36.846889
39.40    39.125449
47.36    44.272549
dtype: float64
import warnings

warnings.filterwarnings("ignore")

# Effect-by-quantile bars for each score: a useful CATE score should
# show effects increasing across its quantiles (noise stays flat)
fig, axs = plt.subplots(1, 3, sharey=True, figsize=(15, 4))
for m, ax in zip(["rand_m_pred", "ml_pred", "cate_pred"], axs):
    effect_by_quantile(test_pred, m, "sales", "discounts").plot.bar(ax=ax)
    ax.set_ylabel("Effect")

Cumulative Effect

# Keep printed numpy arrays compact (truncate long arrays with "...")
np.set_printoptions(linewidth=80, threshold=10)
def cumulative_effect_curve(dataset, prediction, y, t, ascending=False, steps=100):
    """Effect of `t` on `y` over cumulatively larger top slices of
    `dataset`, ordered by the `prediction` column (descending by default).

    Returns an array of `steps` effect estimates, from the smallest top
    slice down to the full dataset (whose last value is the ATE).
    """
    n = len(dataset)
    ranked = dataset.sort_values(prediction, ascending=ascending).reset_index(
        drop=True
    )

    # equally spaced cumulative row counts, from n/steps up to n
    cut_points = np.linspace(n / steps, n, steps).round(0)

    return np.array(
        [effect(ranked.query(f"index<={point}"), t=t, y=y) for point in cut_points]
    )


# Effect on increasingly larger top slices ranked by the CATE score;
# the curve converges to the ATE at 100% of the data
cumulative_effect_curve(test_pred, "cate_pred", "sales", "discounts")
array([49.65116279, 49.37712454, 46.20360341, ..., 32.46981935, 32.33428884,
       32.16196368])
plt.figure(figsize=(10, 4))

# Compare cumulative effect curves of the three scores: a useful score
# starts above the average effect and decays toward it
for m in ["rand_m_pred", "ml_pred", "cate_pred"]:
    cumu_effect = cumulative_effect_curve(test_pred, m, "sales", "discounts", steps=100)
    x = np.array(range(len(cumu_effect)))
    plt.plot(100 * (x / x.max()), cumu_effect, label=m)

# Horizontal reference line: average effect over the whole test set
plt.hlines(
    effect(test_pred, "sales", "discounts"),
    0,
    100,
    linestyles="--",
    color="black",
    label="Avg. Effect.",
)
plt.xlabel("Top %")
plt.ylabel("Cumulative Effect")
plt.title("Cumulative Effect Curve")
plt.legend(fontsize=14)

Cumulative Gain

def cumulative_gain_curve(
    df, prediction, y, t, ascending=False, normalize=False, steps=100
):
    """Cumulative gain curve: the effect on each top slice (ordered by
    `prediction`) scaled by the fraction of rows in that slice.

    With `normalize=True` the full-sample average effect is subtracted
    first, so a random ordering hovers around zero. The returned array
    always starts at 0.
    """
    estimate_effect = effect(t=t, y=y)
    baseline = estimate_effect(df) if normalize else 0

    n = len(df)
    ranked = df.sort_values(prediction, ascending=ascending).reset_index(drop=True)

    # equally spaced cumulative row counts, from n/steps up to n
    cut_points = np.linspace(n / steps, n, steps).round(0)
    gains = [
        (estimate_effect(ranked.query(f"index<={point}")) - baseline) * (point / n)
        for point in cut_points
    ]

    return np.array([0] + gains)


# Cumulative gain of the regression CATE score; ends at the ATE
cumulative_gain_curve(test_pred, "cate_pred", "sales", "discounts")
array([ 0.        ,  0.50387597,  0.982917  , ..., 31.82346463, 32.00615008,
       32.16196368])
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: raw cumulative gain curves for the three scores
for m in ["rand_m_pred", "ml_pred", "cate_pred"]:
    cumu_gain = cumulative_gain_curve(test_pred, m, "sales", "discounts")
    x = np.array(range(len(cumu_gain)))
    ax1.plot(100 * (x / x.max()), cumu_gain, label=m)

# A random ordering traces the straight line from 0 up to the ATE
ax1.plot(
    [0, 100],
    [0, effect(test_pred, "sales", "discounts")],
    linestyle="--",
    label="Random Model",
    color="black",
)

ax1.set_xlabel("Top %")
ax1.set_ylabel("Cumulative Gain")
ax1.set_title("Cumulative Gain Curve")
ax1.legend()


# Right panel: normalized gain curves (average effect subtracted), so
# the random-model baseline becomes the horizontal zero line
for m in ["rand_m_pred", "ml_pred", "cate_pred"]:
    cumu_gain = cumulative_gain_curve(
        test_pred, m, "sales", "discounts", normalize=True
    )
    x = np.array(range(len(cumu_gain)))
    ax2.plot(100 * (x / x.max()), cumu_gain, label=m)

ax2.hlines(0, 0, 100, linestyle="--", label="Random Model", color="black")

ax2.set_xlabel("Top %")
ax2.set_ylabel("Cumulative Gain")
ax2.set_title("Cumulative Gain Curve (Normalized)")
Text(0.5, 1.0, 'Cumulative Gain Curve (Normalized)')

# Summarize each score with the area under its normalized gain curve
# (higher = better effect ordering; random ordering is near zero)
for m in ["rand_m_pred", "ml_pred", "cate_pred"]:
    gain = cumulative_gain_curve(test_pred, m, "sales", "discounts", normalize=True)
    print(f"AUC for {m}:", sum(gain))
AUC for rand_m_pred: 6.0745233598544495
AUC for ml_pred: -45.44063124684
AUC for cate_pred: 181.74573239200615

Target Transformation

X = ["C(month)", "C(weekday)", "is_holiday", "competitors_price"]

# Residualize both outcome and treatment on the covariates
# (FWL-style partialling out)
y_res = smf.ols(f"sales ~ {'+'.join(X)}", data=test).fit().resid
t_res = smf.ols(f"discounts ~ {'+'.join(X)}", data=test).fit().resid

# Target transformation: y_res / t_res is a noisy proxy for the
# individual treatment effect
tau_hat = y_res / t_res
from sklearn.metrics import mean_squared_error

# Weighting by t_res**2 compensates for the division above, making the
# weighted MSE against tau_hat a usable comparison metric for the scores
for m in ["rand_m_pred", "ml_pred", "cate_pred"]:
    wmse = mean_squared_error(tau_hat, test_pred[m], sample_weight=t_res**2)
    print(f"MSE for {m}:", wmse)
MSE for rand_m_pred: 1115.803515760459
MSE for ml_pred: 576256.7425385397
MSE for cate_pred: 42.90447405550281

When Prediction Models are Good for Effect Ordering

Marginal Decreasing Returns

np.random.seed(123)

# Simulated outcome that is logarithmic in the treatment: the marginal
# effect of T shrinks as T grows (decreasing returns)
n = 1000
t = np.random.uniform(1, 30, size=n)
y = np.random.normal(10 + 3 * np.log(t), size=n)

plt.figure(figsize=(10, 4))
plt.scatter(t, y)
plt.ylabel("Y")
plt.xlabel("T")
Text(0.5, 0, 'T')

Binary Outcomes

np.random.seed(123)

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

# Panel 1: binary outcome from a latent normal threshold in t; the mean
# of y per t value traces an S-shaped (nonlinear) curve
n = 10000
t = np.random.uniform(1, 30, size=n).round(1)
y = (np.random.normal(t, 5, size=n) > 15).astype(int)

df_sim = pd.DataFrame(dict(t=t, y=y)).groupby("t")["y"].mean().reset_index()
df_sim

ax1.scatter(df_sim["t"], df_sim["y"])
ax1.set_ylabel("Y")
ax1.set_xlabel("T")


# Panel 2: loan size vs. default probability, exponential treatment;
# marker size encodes how many observations share each t value
n = 10000
t = np.random.exponential(1, n).round(1).clip(0, 8)
y = (np.random.normal(t, 2, size=n) > 6).astype(int)

df_sim = (
    pd.DataFrame(dict(t=t, y=y, size=1))
    .query("t<6")
    .groupby("t")
    .agg({"y": "mean", "size": "sum"})
    .reset_index()
)

sns.scatterplot(data=df_sim, y="y", x="t", size="size", ax=ax2, sizes=(5, 100))
ax2.set_ylabel("Default Prob.")
ax2.set_xlabel("Loan (in 100k)")


# Panel 3: discount vs. conversion with a beta-distributed treatment
n = 10000
t = np.random.beta(2, 2, n).round(2) * 10
y = (np.random.normal(5 + t, 4, size=n) > 0).astype(int)

df_sim = (
    pd.DataFrame(dict(t=t, y=y, size=1))
    .groupby("t")
    .agg({"y": "mean", "size": "sum"})
    .reset_index()
)

sns.scatterplot(data=df_sim, y="y", x="t", size="size", ax=ax3, sizes=(5, 100))
ax3.set_ylabel("Conversion")
ax3.set_xlabel("Discount (%)")
plt.tight_layout()

CATE for Decision Making

def demand(price, tau):
    """Linear demand: a baseline of 50 units, falling in price with
    sensitivity (treatment effect) `tau`."""
    return 50 - price * tau


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

prices = np.linspace(1, 15)

# Demand (left) and revenue = q * price (right) for three levels of
# price sensitivity tau
for i, tau in enumerate([1, 2, 3]):
    q = demand(prices, tau)
    ax1.plot(prices, q, color=f"C{i}", label=f"te={tau}")
    ax2.plot(prices, q * prices, color=f"C{i}", label=f"te={tau}")

ax1.set_ylabel("Demand")
ax1.set_xlabel("Price")

ax2.set_ylabel("Revenues")
ax2.set_xlabel("Price")

ax1.legend()

def cost(q):
    """Total production cost for quantity `q`: constant marginal cost
    of 3 per unit."""
    return 3 * q


def profit(price, tau):
    """Profit at `price` for a unit whose demand sensitivity is `tau`:
    revenue (quantity * price) minus production cost of that quantity."""
    quantity = demand(price, tau)
    revenue = quantity * price
    return revenue - cost(quantity)


prices = np.linspace(1, 30)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Cost (left) and profit (right) per sensitivity level; the dashed
# vertical line marks the profit-maximizing price for each tau
for i, tau in enumerate([1, 2, 3]):
    profits = profit(prices, tau)

    max_price = prices[np.argmax(profits)]

    ax1.plot(prices, cost(demand(prices, tau)), label=f"te={tau}")

    ax2.plot(prices, profits, color=f"C{i}", label=f"te={tau}")
    ax2.vlines(max_price, 0, max(profits), color=f"C{i}", linestyle="--")
    ax2.set_ylim(-400, 600)


ax2.hlines(0, 0, max(prices), color="black")
# ax2.legend()
ax2.set_ylabel("Profits")
ax2.set_xlabel("Price")
ax1.legend()
ax1.set_ylabel("Cost")
ax1.set_xlabel("Price")

plt.tight_layout()

Key Ideas